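This notebook generates two ElasticSearch (ES) indexes with information about the git commits and the GitHub issues and pull requests of the repositories listed in the settings file.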
Let's start by importing the `utils` Python script, setting up the connection to the ES server, and defining some variables.
In [ ]:
import utils
# Set the logging threshold to INFO so the progress messages emitted by the
# upload cells are shown.
# NOTE(review): utils.logging is presumably the standard logging module
# re-exported by utils - confirm.
utils.logging.basicConfig(level=utils.logging.INFO)
# Load the notebook configuration. Other cells read the 'es_host',
# 'github-repo' and 'github_token' entries from it.
settings = utils.read_config_file('settings.yml')
# Connection handle passed to every index-creation and bulk-upload call.
es = utils.establish_connection(settings['es_host'])
Let's give names to the indexes and create them in the ES server. Be careful: `utils.create_ES_index()` deletes any existing index with the same name before creating it.
In [ ]:
# Names of the two ES indexes populated by this notebook.
index_name_git = 'github-git'
index_name_github_issues = 'github-issues'

# Create each index with its mapping: the git index first, then the issues one.
for _index_name, _index_mapping in (
    (index_name_git, utils.MAPPING_GITHUB_GIT),
    (index_name_github_issues, utils.MAPPING_GITHUB_ISSUES),
):
    utils.create_ES_index(es, _index_name, _index_mapping)
Let's import the needed backends from Perceval.
In [ ]:
from perceval.backends.core.git import Git
from perceval.backends.core.github import GitHub
For each repository in the settings file, get the git-related info and upload it to the git ES index defined above.
In [ ]:
def _split_author(author):
    """Split a git author string into ``(name, email_domain)``.

    ``author`` is expected to look like ``'Jane Doe <jane@example.com>'``.
    The name is everything before the first ``<`` with surrounding whitespace
    removed; the domain is whatever follows the last ``@`` inside the angle
    brackets. The domain is an empty string when there is no bracketed
    address or the address has no ``@``, instead of a truncated fragment of
    the name.
    """
    name, _, address = author.partition('<')
    address = address.strip()
    if address.endswith('>'):
        address = address[:-1]
    _, at_sign, domain = address.rpartition('@')
    return name.strip(), (domain if at_sign else '')


def _line_count(file_info, key):
    """Return ``file_info[key]``, or 0 when the key is missing or is ``'-'``."""
    value = file_info.get(key, '-')
    return 0 if value == '-' else value


# Number of documents sent to ES per bulk request. Defined once, outside the
# loop, so it exists even when no repository is configured.
bulk_size = 10000

for repo_url in settings['github-repo']:
    # A trailing slash would otherwise produce an empty repository name and a
    # broken clone URL.
    repo_url = repo_url.rstrip('/')
    url_parts = repo_url.split('/')
    repo_owner = url_parts[-2]
    repo_name = url_parts[-1]
    repo_git_url = repo_url + '.git'

    # NOTE(review): the clone path depends only on the repository name, so two
    # repositories with the same name under different owners would share
    # '/tmp/<name>' - confirm whether that can happen with your settings.
    git_repo = Git(uri=repo_git_url, gitpath='/tmp/' + repo_name)

    utils.logging.info('Parsing log from {}'.format(repo_name))

    # Documents waiting to be sent to ES for the current repository.
    items = []

    for commit in git_repo.fetch():
        commit_data = commit['data']
        contributor_name, contributor_email_domain = _split_author(commit_data['Author'])

        # One ES document per file touched by the commit.
        for file_info in commit_data['files']:
            summary = {
                'date': commit_data['AuthorDate'],
                'commit_id': commit_data['commit'],
                'contributor_name': contributor_name,
                'contributor_email_domain': contributor_email_domain,
                'file': file_info['file'],
                'lines_added': _line_count(file_info, 'added'),
                'lines_removed': _line_count(file_info, 'removed'),
                'github_owner': repo_owner,
                'github_repository': repo_name,
            }

            items.append({'_index': index_name_git, '_type': 'item', '_source': summary})

            # Flush as soon as the batch holds exactly bulk_size documents so
            # the logged count matches what was actually uploaded.
            if len(items) >= bulk_size:
                utils.helpers.bulk(es, items)
                utils.logging.info('{} items uploaded'.format(len(items)))
                items = []

    # Upload whatever is left over for this repository.
    if items:
        utils.helpers.bulk(es, items)
        utils.logging.info('Remaining {} items uploaded'.format(len(items)))
For each repository in the settings file, get the GitHub issue-related info and upload it to the GitHub issues ES index defined above.
In [ ]:
import datetime

# Timestamp layout used when parsing the 'created_at' / 'closed_at' fields.
_GITHUB_TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def _seconds_to_close(issue_data):
    """Return the seconds between creation and closing of an issue/pull request.

    Returns ``None`` when the item's state is not ``'closed'``.
    """
    if issue_data['state'] != 'closed':
        return None
    creation_date = datetime.datetime.strptime(issue_data['created_at'], _GITHUB_TIMESTAMP_FORMAT)
    closing_date = datetime.datetime.strptime(issue_data['closed_at'], _GITHUB_TIMESTAMP_FORMAT)
    return (closing_date - creation_date).total_seconds()


def _name_or_login(profile, account):
    """Return ``profile['name']`` when set, else ``account['login']``, else ``None``.

    Either argument may be ``None``, empty, or lack the key. The login is used
    whenever the name is missing *or* ``None``, so a user without a display
    name is still identified by username.
    """
    name = (profile or {}).get('name')
    if name is not None:
        return name
    return (account or {}).get('login')


# Number of documents sent to ES per bulk request. Defined here so this cell
# does not depend on a variable left behind by the git cell.
bulk_size = 10000

for repo_url in settings['github-repo']:
    # A trailing slash would otherwise produce an empty repository name.
    url_parts = repo_url.rstrip('/').split('/')
    repo_owner = url_parts[-2]
    repo_name = url_parts[-1]

    github_repo = GitHub(owner=repo_owner, repository=repo_name, api_token=settings['github_token'])

    utils.logging.info('Parsing issues from {}'.format(repo_name))

    # Documents waiting to be sent to ES for the current repository.
    items = []

    for issue in github_repo.fetch():
        issue_data = issue['data']
        user_data = issue_data.get('user_data')

        summary = {
            'date': issue_data['created_at'],
            'title': issue_data['title'],
            'state': issue_data['state'],
            'url': issue_data['html_url'],
            'comments': issue_data['comments'],
            'closed_at': issue_data['closed_at'],
            # Only closed items have a time to solve; open ones get None.
            'time_to_solve': _seconds_to_close(issue_data),
            'github_owner': repo_owner,
            'github_repository': repo_name,
            # Submitter's name if there is one, otherwise the GitHub username.
            'contributor_name': _name_or_login(user_data, user_data),
            # Assignee's name if there is one, otherwise the GitHub username;
            # None when nobody is assigned.
            'assignee_name': _name_or_login(issue_data.get('assignee_data'),
                                            issue_data.get('assignee')),
            # Items carrying a 'pull_request' key are pull requests.
            'issue_type': 'pull-request' if 'pull_request' in issue_data else 'issue',
        }

        items.append({'_index': index_name_github_issues, '_type': 'item', '_source': summary})

        # Flush as soon as the batch holds exactly bulk_size documents so the
        # logged count matches what was actually uploaded.
        if len(items) >= bulk_size:
            utils.helpers.bulk(es, items)
            utils.logging.info('{} items uploaded'.format(len(items)))
            items = []

    # Upload whatever is left over for this repository.
    if items:
        utils.helpers.bulk(es, items)
        utils.logging.info('Remaining {} items uploaded'.format(len(items)))